import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import classification_report
from scipy.stats import mode
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.offline as pyo
pyo.init_notebook_mode()
# Local imports
from ipynb.fs.defs.task3_1 import DatasetManager
from ipynb.fs.defs.task3_2 import ModelManager, plot_bar_data, plot_collection
# ModelManager class is modified to accomodate new clustering models
class ModelManager3(ModelManager):
    """
    Extends ModelManager with MiniBatchKMeans clustering models that are
    re-purposed for regression and classification tasks.
    """

    def __init__(self, feature_set, targets):
        super().__init__(feature_set, targets)

    def train_model_reg(self):
        """
        Trains a MiniBatchKMeans clustering model for regression tasks.

        Each cluster is assigned the mean target value of the training
        instances that fall in it; a test instance's prediction is the
        mean target of its nearest cluster.
        """
        # Consistency with train_model_clf: fail early if split_dataset
        # has not been called yet.
        assert self._train_and_test_sets is not None, "You don't have your training and test sets."
        # Getting training and test data
        X_train = self._train_and_test_sets.get("X_train")
        y_train = self._train_and_test_sets.get("y_train")
        X_test = self._train_and_test_sets.get("X_test")
        # Fit the KMeans model on the training data
        print("Fitting model...")
        # n_init pinned to the current default (3) to keep behaviour
        # unchanged while silencing sklearn's FutureWarning about the
        # default changing to 'auto' in 1.4.
        km = MiniBatchKMeans(n_clusters=50, batch_size=50, n_init=3)
        km.fit(X_train)
        print("Model fitting complete...")
        # Assign training instances to their closest cluster and compute
        # the mean target value of each cluster.
        print("Making predictions...")
        train_labels = km.predict(X_train)
        # MiniBatchKMeans can leave a cluster with no assigned training
        # instances; fall back to the global training mean instead of
        # producing NaN predictions for that cluster.
        global_mean = y_train.mean()
        cluster_means = np.array([
            y_train[train_labels == i].mean() if np.any(train_labels == i) else global_mean
            for i in range(km.n_clusters)
        ])
        train_preds = cluster_means[train_labels]
        # Predict clusters of the test set and assign each instance the
        # mean target value of its corresponding cluster.
        test_labels = km.predict(X_test)
        test_preds = cluster_means[test_labels]
        self._train_preds = train_preds
        self._test_preds = test_preds
        self._trained_model = km

    def train_model_clf(self, n_classes):
        """
        Trains a MiniBatchKMeans clustering model for classification tasks.

        Raw cluster ids are arbitrary, so each cluster is mapped to the
        most common true class among its training members; predictions are
        therefore directly comparable with the true class labels. (This
        was the intended use of the otherwise-unused scipy `mode` import;
        np.unique with counts is used here to avoid scipy API drift.)
        """
        assert self._train_and_test_sets is not None, "You don't have your training and test sets."
        # Getting training and test data
        X_train = self._train_and_test_sets.get("X_train")
        y_train = self._train_and_test_sets.get("y_train")
        X_test = self._train_and_test_sets.get("X_test")
        # Fit the KMeans model on the training data
        print("Fitting model...")
        # n_init pinned explicitly (see train_model_reg).
        km = MiniBatchKMeans(n_clusters=n_classes, batch_size=n_classes, n_init=3)
        km.fit(X_train)
        print("Model fitting complete...")
        print("Making predictions...")
        train_clusters = km.predict(X_train)
        # Map each cluster id to the modal true class of its members.
        label_map = {}
        for i in range(n_classes):
            members = y_train[train_clusters == i]
            if members.size:
                vals, counts = np.unique(members, return_counts=True)
            else:
                # Empty cluster: fall back to the overall majority class.
                vals, counts = np.unique(y_train, return_counts=True)
            label_map[i] = vals[counts.argmax()]
        train_preds = np.array([label_map[c] for c in train_clusters])
        # Predict clusters on the test set and translate to class labels.
        test_clusters = km.predict(X_test)
        test_preds = np.array([label_map[c] for c in test_clusters])
        # Assign class variables
        self._trained_model = km
        self._train_preds = train_preds
        self._test_preds = test_preds

    def visualise_results_clf(self):
        """
        Creates a series of plots to visualise performance results for a
        classification model.

        Returns the combined plotly figure produced by plot_collection.
        """
        assert self._trained_model is not None, "You haven't trained a model yet."
        # Getting training, test and predictions data
        y_train = self._train_and_test_sets.get("y_train")
        y_test = self._train_and_test_sets.get("y_test")
        train_preds = self._train_preds
        test_preds = self._test_preds
        # Get key metric plot
        key_metric_plot = self._get_key_metric_plot_clf(y_train, y_test, train_preds, test_preds)
        # True values vs predictions
        true_pred_plot = self._get_true_pred_plot(y_test, test_preds)
        # Combining plots; dict keys are (row, col, x-axis title, y-axis title)
        # NOTE(review): key layout inferred from usage in task3_2's
        # plot_collection — confirm against that helper.
        plots = {
            (1, 1, "Dataset", ""): key_metric_plot,
            (2, 1, "", "Class"): true_pred_plot,
        }
        subplot_titles = [
            "Key Metrics",
            "True vs Predicted Values",
        ]
        specs = [
            [{"type": "bar", "colspan": 2}, None],
            [{"type": "xy", "colspan": 2}, None],
        ]
        combined_plot = plot_collection(
            plots,
            rows=2,
            cols=2,
            subplot_titles=subplot_titles,
            specs=specs,
            title="Model Performance Results",
        )
        return combined_plot
# Productivity dataset; using optimal configuration as determined in Task3-1
gwp_dsm = DatasetManager("gwp_assessment")
# NOTE(review): args appear to be column indices plus an imputation
# strategy name — confirm against DatasetManager in task3_1.
gwp_dsm.load_and_preprocess([0,1,2,3], "iterative")
gwp_dsm.create_feature_set(7)
gwp_dsm.scale_feature_set()
# Star dataset; using optimal configuration as determined in Task3-1
star_dsm = DatasetManager("star_assessment")
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "knn")
star_dsm.create_feature_set(8)
star_dsm.scale_feature_set()
Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed... Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed...
# Productivity dataset: scaled features; target taken from the last column
gwp_features = gwp_dsm.get_scaled_feat_ds()
gwp_targets = gwp_dsm.get_complete_ds()[:, -1]
# Star dataset: scaled features; target taken from the last column
star_features = star_dsm.get_scaled_feat_ds()
star_targets = star_dsm.get_complete_ds()[:, -1]
# GWP dataset model manager
gwp_mm = ModelManager3(gwp_features, gwp_targets)
# Star dataset model manager
star_mm = ModelManager3(star_features, star_targets)
Methodology
Evaluation metrics
Star dataset: accuracy, precision, recall, F1 score. These metrics are ideal metrics for evaluating classification models as they provide comprehensive insight into a model's performance. Accuracy helps understand the overall effectiveness of the model. However, it can be misleading in imbalanced datasets, which is where precision and recall come in. They provide a more nuanced view of the model's ability to correctly identify positive instances and avoid false positives. The F1 score harmonises precision and recall, offering a single metric that seeks a balance between these two characteristics, making it especially useful when the costs of false positives and false negatives are significantly different.
Productivity dataset: mean squared error (MSE), mean absolute error (MAE), R2 score. These are robust metrics for evaluating regression models, with each illuminating different aspects of model performance. MSE emphasizes larger errors by squaring residuals, making it useful when larger errors are undesirable. MAE provides a more straightforward measure of average error magnitude, regardless of direction. The R2 score complements these by providing a relative measure of how much variance the model can explain, giving a broader picture of model performance beyond just raw error. These combined provide a comprehensive assessment of the model's effectiveness.
Notes
# Splitting productivity dataset (80/20 train/test)
gwp_mm.split_dataset(train_size=0.8, test_size=0.2)
# Splitting star dataset — same 4:1 ratio, but only 2% of the data is
# used; presumably the star dataset is very large — confirm.
star_mm.split_dataset(train_size=0.016, test_size=0.004)
# Productivity dataset: regression via KMeans cluster-mean predictor
gwp_mm.train_model_reg()
Fitting model... Model fitting complete... Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Star dataset: classification with 3 clusters (one per class)
star_mm.train_model_clf(3)
Fitting model... Model fitting complete... Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Productivity dataset (comment corrected: this is the regression plot)
gwp_mm.visualise_results_reg()
# Star dataset
star_mm.visualise_results_clf()
Productivity dataset
Star dataset
# Splitting productivity dataset (75/25 train/test)
gwp_mm.split_dataset(train_size=0.75, test_size=0.25)
# Splitting star dataset — same 3:1 ratio on 2% of the data
star_mm.split_dataset(train_size=0.015, test_size=0.005)
# Productivity dataset: regression model
gwp_mm.train_model_reg()
Fitting model... Model fitting complete... Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Star dataset: classification with 3 clusters
star_mm.train_model_clf(3)
Fitting model... Model fitting complete... Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Productivity dataset (comment corrected: this is the regression plot)
gwp_mm.visualise_results_reg()
# Star dataset
star_mm.visualise_results_clf()
Productivity dataset
Star dataset
# Splitting productivity dataset (70/30 train/test)
gwp_mm.split_dataset(train_size=0.7, test_size=0.3)
# Splitting star dataset — same 7:3 ratio on 2% of the data
star_mm.split_dataset(train_size=0.014, test_size=0.006)
# Productivity dataset: regression model
gwp_mm.train_model_reg()
Fitting model... Model fitting complete... Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Star dataset: classification with 3 clusters
star_mm.train_model_clf(3)
Fitting model... Model fitting complete... Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Productivity dataset (comment corrected: this is the regression plot)
gwp_mm.visualise_results_reg()
# Star dataset
star_mm.visualise_results_clf()
Productivity dataset
Star dataset
# Splitting productivity dataset (60/40 train/test)
gwp_mm.split_dataset(train_size=0.6, test_size=0.4)
# Splitting star dataset — same 3:2 ratio on 2% of the data
star_mm.split_dataset(train_size=0.012, test_size=0.008)
# Productivity dataset: regression model
gwp_mm.train_model_reg()
Fitting model... Model fitting complete... Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Star dataset: classification with 3 clusters
star_mm.train_model_clf(3)
Fitting model... Model fitting complete... Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Productivity dataset (comment corrected: this is the regression plot)
gwp_mm.visualise_results_reg()
# Star dataset
star_mm.visualise_results_clf()
Productivity dataset
Star dataset
# Splitting productivity dataset (50/50 train/test)
gwp_mm.split_dataset(train_size=0.5, test_size=0.5)
# Splitting star dataset — same 1:1 ratio on 2% of the data
star_mm.split_dataset(train_size=0.01, test_size=0.01)
# Productivity dataset: regression model
gwp_mm.train_model_reg()
Fitting model... Model fitting complete... Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Star dataset: classification with 3 clusters
star_mm.train_model_clf(3)
Fitting model... Model fitting complete... Making predictions...
/Users/bhekimaenetja/.local/share/virtualenvs/small-projects-ai-NRjJWIjk/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Productivity dataset (comment corrected: this is the regression plot)
gwp_mm.visualise_results_reg()
# Star dataset
star_mm.visualise_results_clf()
Productivity dataset
Star dataset
Productivity dataset
Star dataset
When it comes to the star dataset, the classification models (both SVM and MLP) have the clear advantage over the clustering models. Across all metrics (accuracy, precision, recall, F1 score), the classification models score roughly 20-30 percentage points higher than the clustering models. This is primarily because the classification models leverage known labels during training, which guides the models to correct their mistakes, hence refining their decision boundaries over time. The clustering models, on the other hand, rely purely on data patterns and distributions to make their groupings; this has the potential to create severe inaccuracies. Additionally, the structured approach of the classification models, especially when dealing with a small number of distinct classes, allows for the clear differentiation and better handling of complex relationships between features, thus yielding better results. Lastly, the classification models benefit from a range of techniques like regularisation, boosting, or bagging to prevent overfitting, and they are able to manage class imbalance better than the clustering models, thereby improving their overall performance.